# coding:utf-8
import re
import urlparse
from bs4 import BeautifulSoup
import urllib

class HtmlParser(object):
    """Extract follow-up links and entry data from a Baidu Baike page."""

    # Site root used to resolve the hrefs found on a page.
    _BASE_URL = "http://baike.baidu.com"
    # Anchors whose href contains "/item/" followed by a percent-encoded
    # name; compiled once instead of on every call.
    _ITEM_LINK_RE = re.compile(r'/item/(\%+)')

    def parser(self, page_url, html_cont):
        """Parse one downloaded page.

        :param page_url: URL the page was downloaded from.
        :param html_cont: raw HTML content of the page.
        :return: ``(new_urls, new_data)`` where ``new_urls`` is a set of
            absolute URLs and ``new_data`` is a dict with the keys
            ``url``, ``title`` and ``summary``; ``None`` when either
            argument is ``None``.
        """
        if page_url is None or html_cont is None:
            return None
        # The encoding hint was previously misspelled as 'urf-8', which is
        # not a valid codec name.
        soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
        new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_urls, new_data

    def _get_new_urls(self, page_url, soup):
        """Collect the absolute URLs of all entry links on the page.

        :param page_url: URL of the page being parsed (currently unused;
            links are resolved against the site root).
        :param soup: parsed document supporting ``find_all``.
        :return: set of absolute URL strings.
        """
        new_urls = set()
        # Select the <a> tags whose href matches the entry-link pattern.
        for link in soup.find_all('a', href=self._ITEM_LINK_RE):
            # urljoin leaves already-absolute hrefs intact, whereas plain
            # string concatenation would glue the site root in front of them.
            new_full_url = urlparse.urljoin(self._BASE_URL, link['href'])
            print(new_full_url)
            new_urls.add(new_full_url)
        return new_urls

    def _get_new_data(self, page_url, soup):
        """Extract the title and summary text of the entry.

        A node that is missing from the page yields an empty string for the
        corresponding field instead of raising ``AttributeError``.

        :param page_url: URL of the page being parsed.
        :param soup: parsed document supporting ``find``.
        :return: dict with the keys ``url``, ``title`` and ``summary``.
        """
        data = {'url': page_url}

        title_box = soup.find('dd', class_='lemmaWgt-lemmaTitle-title')
        title = title_box.find('h1') if title_box is not None else None
        data['title'] = title.get_text() if title is not None else u''

        summary = soup.find('div', class_='lemma-summary')
        # get_text() returns all text inside the tag, including the text of
        # nested tags, as a single unicode string.
        data['summary'] = summary.get_text() if summary is not None else u''
        return data